library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(colorspace)
library(GGally) ; library(ggpubr) ; library(ggExtra)
library(WGCNA)
library(expss)
library(polycor)
library(biomaRt)
library(clusterProfiler) ; library(ReactomePA) ; library(DOSE) ; library(org.Hs.eg.db)
library(knitr) ; library(kableExtra)
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
# Get colors from the ggplot palette
gg_colour_hue = function(n) {
hues = seq(15, 375, length = n+1)
pal = hcl(h = hues, l = 65, c = 100)[1:n]
}
# Assign an HCL rainbow colour to each module
get_mod_colours = function(mods){
n = length(unique(mods))-1
set.seed(123) ; rand_order = sample(1:n)
mod_colors = c('white', gg_colour_hue(n)[rand_order])
names(mod_colors) = mods %>% table %>% names
return(mod_colors)
}
# SFARI Genes
SFARI_genes = read_csv('./../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
# Load Gandal dataset
load('./../Data/preprocessedData/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
# WGCNA metrics
WGCNA_metrics = read.csv('./../Data/preprocessedData/WGCNA_metrics.csv')
# Updates genes_info with SFARI information and clusters
genes_info = genes_info %>% left_join(SFARI_genes, by = 'ID') %>%
left_join(datGenes %>% mutate(ID = rownames(.)) %>% dplyr::select(ID, hgnc_symbol), by = 'ID') %>%
dplyr::select(ID, hgnc_symbol, log2FoldChange, shrunken_log2FoldChange, significant, Neuronal) %>%
left_join(WGCNA_metrics, by = 'ID') %>% dplyr::select(-contains('pval'))
rm(dds, WGCNA_metrics)
Modules with a cluster-diagnosis correlation manitude larger than 0.9
plot_data = genes_info %>% dplyr::select(Module, module_number, MTcor) %>% distinct %>%
mutate(alpha = ifelse(abs(MTcor)>0.9, 1, 0.3))
top_modules = plot_data %>% arrange(desc(MTcor)) %>% filter(abs(MTcor) > 0.9) %>% pull(Module) %>% as.character
The 3 modules that fulfill this condition are clusters 20, 36, 45
ggplotly(plot_data %>% mutate(module_number=ifelse(module_number == max(module_number), 'No cluster',
paste('Cluster', module_number))) %>%
ggplot(aes(reorder(module_number, -MTcor), MTcor)) +
geom_bar(stat='identity', fill = plot_data$Module, alpha = plot_data$alpha) +
geom_hline(yintercept =c(0.9, -0.9), color = 'gray', linetype = 'dotted') +
xlab('Clusters')+ ylab('Cluster-diagnosis correlation') + theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)))
The modules consist mainly of points with high (absolute) values in PC2 (which we know is related to lfc), so this result is consistent with the high correlation between Module and Diagnosis, although some of the points with the highest PC2 values do not belong to these top modules
The genes belonging to the modules with the positive Module-Diagnosis correlation have positive LFC values and the genes belonging to the modules with the negative Module-Diagnosis correlation have negative values.
pca = datExpr %>% prcomp
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(genes_info %>% dplyr::select(ID,Module,module_number,gene.score,hgnc_symbol), by='ID') %>%
mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
mutate(ImportantModules_number = ifelse(ImportantModules == 'Others', 'Others',
paste('Cluster', genes_info$module_number[genes_info$Module==ImportantModules]))) %>%
mutate(color = ifelse(ImportantModules=='Others', 'gray', ImportantModules),
alpha = ifelse(ImportantModules=='Others', 0.2, 0.4),
gene_id = paste0(ID, ' (', hgnc_symbol, ')')) %>%
apply_labels(ImportantModules_number = 'Top Clusters')
cro(plot_data$ImportantModules_number)
|  #Total | |
|---|---|
|  Top Clusters | |
| Â Â Â Cluster 20Â | 562 |
| Â Â Â Cluster 36Â | 623 |
| Â Â Â Cluster 45Â | 97 |
|    Others | 14835 |
|    #Total cases | 16117 |
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) +
geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=gene_id)) + theme_minimal() +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
ggtitle('Genes belonging to the clusters with the strongest relation to ASD'))
rm(pca)
Since these modules have the strongest relation to autism, this pattern should be reflected in their model eigengenes, having two different behaviours for the samples corresponding to autism and the ones corresponding to control
In all cases, the Eigengenes separate the behaviour between autism and control samples very clearly (p-value < \(10^{-4}\)). Modules with positive Module-Diagnosis correlation have higher eigengenes in the ASD samples and Modules with a negative correlation, in the Control samples
plot_EGs = function(module){
plot_data = data.frame('ID' = rownames(MEs), 'MEs' = MEs[,paste0('ME', module)],
'Diagnosis' = datMeta$Diagnosis)
p = plot_data %>% ggplot(aes(Diagnosis, MEs, fill=Diagnosis)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1],
' (MTcor=',round(genes_info$MTcor[genes_info$Module == module][1],2),')')) +
stat_compare_means(method = 't.test', method.args = list(var.equal = FALSE), label = 'p.signif',
ref.group = 'ASD') +
ylab('Cluster Eigengenes') + theme_minimal() + theme(legend.position='none')#, plot.margin = margin(0,0,0,2,'cm'))
return(p)
}
# Calculate Module Eigengenes
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)
p1 = plot_EGs(top_modules[1])
p2 = plot_EGs(top_modules[2])
p3 = plot_EGs(top_modules[3])
grid.arrange(p1, p2, p3, nrow=1)
rm(plot_EGs, ME_object, MEs, p1, p2, p3)
In the WGCNA pipeline, the most representative genes of each module are selected based on having a high module membership as well as a high (absolute) gene significance, so I’m going to do the same
SFARI Genes don’t seem to be more representative than the rest of the genes
create_plot = function(module){
plot_data = genes_info %>% dplyr::select(ID, paste0('MM.',gsub('#', '', module)), GS, gene.score) %>%
filter(genes_info$Module==module)
colnames(plot_data)[2] = 'MM'
SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='Others'])))
p = ggplotly(plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI',
as.character(gene.score))) %>%
ggplot(aes(MM, GS, color=gene.score)) +
geom_point(alpha=0.5, aes(ID=ID)) + xlab('Cluster Membership') + ylab('Gene Significance') +
ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1], ' (MTcor = ',
round(genes_info$MTcor[genes_info$Module == module][1],2),')')) +
scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal())
# For thesis
# p = plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI',
# ifelse(gene.score=='Neuronal', as.character(gene.score),
# paste('Score',as.character(gene.score))))) %>%
# mutate(gene.score = factor(gene.score, levels = c('Score 1', 'Score 2', 'Score 3', 'Neuronal',
# 'Not in SFARI')),
# alpha = ifelse(grepl('Score', gene.score), 1, 0)) %>%
# ggplot(aes(MM, GS, color=gene.score, shape = gene.score)) + geom_point(aes(alpha = alpha)) +
# xlab('Cluster Membership') + ylab('Gene Significance') + scale_alpha_binned(range = c(0.5, 0.9)) +
# scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal() +
# labs(color = 'SFARI Score', shape = 'SFARI Score') + guides(alpha = FALSE)
# if(module != top_modules[length(top_modules)]) {p = p + theme(legend.position = 'none')}
return(p)
}
create_plot(top_modules[1])
create_plot(top_modules[2])
create_plot(top_modules[3])
rm(create_plot)
Ordered by \(\frac{MM+|GS|}{2}\)
There aren’t that many SFARI genes in the top genes of the modules
create_table = function(module){
top_genes = genes_info %>% dplyr::select(ID, hgnc_symbol, paste0('MM.',gsub('#','',module)), GS, gene.score) %>%
filter(genes_info$Module==module) %>% dplyr::rename('MM' = paste0('MM.',gsub('#','',module))) %>%
mutate(Relevance = (MM+abs(GS))/2,
gene.score = ifelse(gene.score == 'Others', 'Not in SFARI', as.character(gene.score))) %>%
arrange(by=-Relevance) %>% top_n(10) %>%
dplyr::rename('Gene Symbol' = hgnc_symbol, 'SFARI Score' = gene.score)
return(top_genes)
}
top_genes = list()
for(i in 1:length(top_modules)) top_genes[[i]] = create_table(top_modules[i])
kable(top_genes[[1]] %>% dplyr::select(-ID),
caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[1]][1],
' (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[1]][1],2),')')) %>%
kable_styling(full_width = F)
| Gene Symbol | MM | GS | SFARI Score | Relevance |
|---|---|---|---|---|
| MCL1 | 0.8325746 | 0.9067802 | Not in SFARI | 0.8696774 |
| ITGA5 | 0.8372503 | 0.8588163 | Not in SFARI | 0.8480333 |
| PXN | 0.8158056 | 0.8682115 | Not in SFARI | 0.8420086 |
| SRGAP1 | 0.7298273 | 0.9211833 | Not in SFARI | 0.8255053 |
| CFLAR | 0.8063163 | 0.8155733 | Not in SFARI | 0.8109448 |
| RREB1 | 0.7626458 | 0.8430492 | Not in SFARI | 0.8028475 |
| MYOF | 0.7685244 | 0.8326888 | Not in SFARI | 0.8006066 |
| SERPINE1 | 0.8039283 | 0.7893995 | 3 | 0.7966639 |
| OLFML2B | 0.7317113 | 0.8526315 | Not in SFARI | 0.7921714 |
| LATS2 | 0.7354979 | 0.8487660 | Not in SFARI | 0.7921319 |
kable(top_genes[[2]] %>% dplyr::select(-ID),
caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[2]][1],
' (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[2]][1],2),')')) %>%
kable_styling(full_width = F)
| Gene Symbol | MM | GS | SFARI Score | Relevance |
|---|---|---|---|---|
| EPS8 | 0.7997752 | 0.8310258 | Not in SFARI | 0.8154005 |
| YAP1 | 0.7535539 | 0.8480815 | Not in SFARI | 0.8008177 |
| TOB2 | 0.7257486 | 0.8677757 | Not in SFARI | 0.7967622 |
| FOXO1 | 0.7358202 | 0.8502902 | Neuronal | 0.7930552 |
| PARD3B | 0.7719367 | 0.8139975 | 2 | 0.7929671 |
| CLIC1 | 0.7415732 | 0.7915367 | Not in SFARI | 0.7665550 |
| ACKR3 | 0.7658517 | 0.7522397 | Not in SFARI | 0.7590457 |
| FAM129B | 0.6339106 | 0.8683425 | Not in SFARI | 0.7511266 |
| IL1R1 | 0.7554235 | 0.7442369 | Not in SFARI | 0.7498302 |
| SLC16A9 | 0.7597334 | 0.7022276 | Not in SFARI | 0.7309805 |
kable(top_genes[[3]] %>% dplyr::select(-ID),
caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[3]][1],
' (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[3]][1],2),')')) %>%
kable_styling(full_width = F)
| Gene Symbol | MM | GS | SFARI Score | Relevance |
|---|---|---|---|---|
| RNF157 | 0.8841689 | -0.8478223 | Not in SFARI | 0.8659956 |
| LIMK1 | 0.8662292 | -0.8615976 | Neuronal | 0.8639134 |
| MAP3K9 | 0.8066583 | -0.8862639 | Not in SFARI | 0.8464611 |
| CPLX1 | 0.8479632 | -0.8259295 | Neuronal | 0.8369463 |
| PNKD | 0.8356434 | -0.8331841 | Not in SFARI | 0.8344138 |
| VAMP1 | 0.8505980 | -0.8010863 | Neuronal | 0.8258422 |
| SHD | 0.7828133 | -0.8622930 | Not in SFARI | 0.8225532 |
| KATNB1 | 0.8528400 | -0.7891769 | Neuronal | 0.8210084 |
| UBE2M | 0.7887791 | -0.8492042 | Neuronal | 0.8189916 |
| LYNX1 | 0.8177942 | -0.8009674 | Not in SFARI | 0.8093808 |
rm(create_table, i)
Most of the top genes, idenpendently of the cluster, have very high absolute values for the 2nd principal component
pca = datExpr %>% prcomp
ids = c()
for(tg in top_genes) ids = c(ids, tg$ID)
plot_data = data.frame('ID'=rownames(datExpr), 'gene_name' = genes_info$hgnc_symbol,
'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(genes_info, by='ID') %>% dplyr::select(ID, PC1, PC2,gene_name, Module, gene.score) %>%
mutate(color = ifelse(Module %in% top_modules, as.character(Module), '#cccccc')) %>%
mutate(alpha = ifelse(color %in% top_modules & ID %in% ids, 1,
ifelse(color %in% top_modules, 0.25, 0.1)))
plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) +
geom_text(aes(label=ifelse(ID %in% ids,as.character(gene_name),'')),
color = plot_data$color, size = 2.5, hjust = 0, vjust = 0) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
theme_minimal() + ggtitle('Most relevant genes for top clusters')
rm(pca, tg, plot_data)
30/30 genes are differentially expressed
2/30 genes are differentially expressed (SERPINE1, PARD3B)
Level of expression by Diagnosis for top genes, ordered by relevance (as defined above): There is a visible difference in level of expression between diagnosis groups in all of these genes
create_plot = function(i){
plot_data = datExpr[rownames(datExpr) %in% top_genes[[i]]$ID,] %>% mutate('ID' = rownames(.)) %>%
melt(id.vars='ID') %>% mutate(variable = gsub('X','',variable)) %>%
left_join(top_genes[[i]], by='ID') %>%
left_join(datMeta %>% dplyr::select(Dissected_Sample_ID, Diagnosis),
by = c('variable'='Dissected_Sample_ID')) %>% arrange(desc(Relevance))
p = ggplotly(plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`,
levels=unique(plot_data$`Gene Symbol`), ordered=T)) %>%
ggplot(aes(external_gene_id, value, fill=Diagnosis)) + geom_boxplot() + theme_minimal() +
ggtitle(paste0('Top genes for cluster ',
genes_info$module_number[genes_info$Module==top_modules[i]][1],
' (MTcor = ',
round(genes_info$MTcor[genes_info$Module == top_modules[i]][1],2), ')')) +
xlab('') + ylab('Level of Expression') +
theme(axis.text.x = element_text(angle = 90, hjust = 1)))
# For thesis
# p = plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`,
# levels=rev(unique(plot_data$`Gene Symbol`)), ordered=T)) %>%
# ggplot(aes(external_gene_id, value, fill=Diagnosis)) +
# geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
# xlab('') + ylab('Level of expression') + theme_minimal() + coord_flip()
# if(i < length(top_modules)) {p = p + theme(legend.position = 'none')}
return(p)
}
create_plot(1)
create_plot(2)
create_plot(3)
rm(create_plot)
Using ORA, as it was done in the previous section and selecting as top clusters the ones with an adjusted p-value lower than \(10^{-3}\) (enrichment higher than 0.999)
# Calculate % and ORA of SFARI Genes in each module
#modules = unique(genes_info$Module[genes_info$Module!='gray']) %>% as.character
modules = unique(genes_info$Module) %>% as.character
# We need the entrez ID of the genes for this
getinfo = c('ensembl_gene_id','entrezgene')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org')
biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'),
values=genes_info$ID, mart=mart) %>%
left_join(genes_info %>% dplyr::select(ID, Module,gene.score), by = c('ensembl_gene_id'='ID'))
# We need to build a term2gene dataframe with the genes and their SFARI Scores
term2gene = biomart_output %>% mutate(term = ifelse(gene.score == 'Others', 'Others', 'SFARI'),
'gene' = entrezgene) %>% dplyr::select(term, gene) %>% distinct
enrichment_data = data.frame('Module' = modules, 'size' = 0, 'perc_SFARI' = 0,
'pval_ORA' = 0, 'padj_ORA' = 0)
for(i in 1:length(modules)){
module = modules[i]
genes_in_module = biomart_output$entrezgene[biomart_output$Module==module]
ORA_module = enricher(gene = genes_in_module, universe = biomart_output$entrezgene %>% as.character,
pAdjustMethod = 'bonferroni', TERM2GENE = term2gene,
pvalueCutoff = 1, qvalueCutoff = 1, maxGSSize = 50000) %>%
data.frame %>% dplyr::select(-geneID,-Description)
ORA_pval = ifelse('SFARI' %in% ORA_module$ID, ORA_module$pvalue[ORA_module$ID=='SFARI'], 1)
ORA_padj = ifelse('SFARI' %in% ORA_module$ID, ORA_module$p.adjust[ORA_module$ID=='SFARI'], 1)
enrichment_data[i,-1] = c(length(genes_in_module),
mean(genes_info$gene.score[genes_info$Module==module]!='Others'),
ORA_pval, ORA_padj)
}
enrichment_data = enrichment_data %>%
left_join(genes_info %>% dplyr::select(Module) %>% unique, by = 'Module')
genes_info = genes_info %>% left_join(enrichment_data, by = 'Module')
plot_data = genes_info %>% dplyr::select(Module, module_number, MTcor, size, padj_ORA) %>% distinct %>%
mutate(alpha = ifelse(abs(1-padj_ORA)>0.999, 1, 0.3))
top_modules = plot_data %>% arrange(padj_ORA) %>% filter((1-padj_ORA) > 0.999) %>% pull(Module) %>% as.character
rm(i, module, genes_in_module, ORA_module, ORA_pval, ORA_padj, getinfo, mart, term2gene)
The 3 modules that fulfill this condition are clusters 7, 22, 39
ggplotly(plot_data %>% ggplot(aes(MTcor, padj_ORA, size=size)) +
geom_point(color=plot_data$Module, alpha=0.5, aes(id=module_number)) +
geom_hline(yintercept = 0.001, color = 'gray', linetype = 'dotted') +
xlab('Cluster-diagnosis Correlation') + ylab('Corrected p-values') + scale_y_log10() +
theme_minimal() + theme(legend.position = 'none') +
ggtitle('Clusters Significantly Enriched in SFARI Genes'))
pca = datExpr %>% prcomp
plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(genes_info %>% dplyr::select(ID,Module,module_number,gene.score,hgnc_symbol), by='ID') %>%
mutate(ImportantModules = ifelse(Module %in% top_modules, as.character(Module), 'Others')) %>%
mutate(ImportantModules_number = ifelse(ImportantModules == 'Others', 'Others',
paste('Cluster', genes_info$module_number[genes_info$Module==ImportantModules]))) %>%
mutate(color = ifelse(ImportantModules=='Others', 'gray', ImportantModules),
alpha = ifelse(ImportantModules=='Others', 0.2, 0.4),
gene_id = paste0(ID, ' (', hgnc_symbol, ')')) %>%
apply_labels(ImportantModules_number = 'Top Clusters')
cro(plot_data$ImportantModules_number)
|  #Total | |
|---|---|
|  Top Clusters | |
| Â Â Â Cluster 22Â | 67 |
| Â Â Â Cluster 39Â | 360 |
| Â Â Â Cluster 7Â | 1504 |
|    Others | 14186 |
|    #Total cases | 16117 |
ggplotly(plot_data %>% ggplot(aes(PC1, PC2, color=ImportantModules)) +
geom_point(alpha=plot_data$alpha, color=plot_data$color, aes(ID=gene_id)) + theme_minimal() +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
ggtitle('Genes belonging to the clusters with the strongest relation to ASD'))
rm(pca)
In all cases, the Eigengenes separate the behaviour between autism and control samples very clearly (p-value < \(10^{-4}\)). Modules with positive Module-Diagnosis correlation have higher eigengenes in the ASD samples and Modules with a negative correlation, in the Control samples
plot_EGs = function(module){
plot_data = data.frame('ID' = rownames(MEs), 'MEs' = MEs[,paste0('ME', module)],
'Diagnosis' = datMeta$Diagnosis)
p = plot_data %>% ggplot(aes(Diagnosis, MEs, fill=Diagnosis)) +
geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) +
ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1],
' (MTcor=',round(genes_info$MTcor[genes_info$Module == module][1],2),')')) +
stat_compare_means(method = 't.test', method.args = list(var.equal = FALSE), label = 'p.signif',
ref.group = 'ASD') +
ylab('Cluster Eigengenes') + theme_minimal() + theme(legend.position='none')#, plot.margin = margin(0,0,0,2,'cm'))
return(p)
}
# Calculate Module Eigengenes
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)
p1 = plot_EGs(top_modules[1])
p2 = plot_EGs(top_modules[2])
p3 = plot_EGs(top_modules[3])
grid.arrange(p1, p2, p3, nrow=1)
rm(plot_EGs, ME_object, MEs, p1, p2, p3)
In the WGCNA pipeline, the most representative genes of each module are selected based on having a high module membership as well as a high (absolute) gene significance, so I’m going to do the same
SFARI Genes don’t seem to be more representative than the rest of the genes
create_plot = function(module){
plot_data = genes_info %>% dplyr::select(ID, paste0('MM.',gsub('#', '', module)), GS, gene.score) %>%
filter(genes_info$Module==module)
colnames(plot_data)[2] = 'MM'
SFARI_colors = as.numeric(names(table(as.character(plot_data$gene.score)[plot_data$gene.score!='Others'])))
p = ggplotly(plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI',
as.character(gene.score))) %>%
ggplot(aes(MM, GS, color=gene.score)) +
geom_point(alpha=0.5, aes(ID=ID)) + xlab('Cluster Membership') + ylab('Gene Significance') +
ggtitle(paste0('Cluster ', genes_info$module_number[genes_info$Module==module][1], ' (MTcor = ',
round(genes_info$MTcor[genes_info$Module == module][1],2),')')) +
scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal())
# For thesis
# p = plot_data %>% mutate(gene.score = ifelse(gene.score =='Others', 'Not in SFARI',
# ifelse(gene.score=='Neuronal', as.character(gene.score),
# paste('Score',as.character(gene.score))))) %>%
# mutate(gene.score = factor(gene.score, levels = c('Score 1', 'Score 2', 'Score 3', 'Neuronal',
# 'Not in SFARI')),
# alpha = ifelse(grepl('Score', gene.score), 1, 0)) %>%
# ggplot(aes(MM, GS, color=gene.score, shape = gene.score)) + geom_point(aes(alpha = alpha)) +
# xlab('Cluster Membership') + ylab('Gene Significance') + scale_alpha_binned(range = c(0.5, 0.9)) +
# scale_color_manual(values=SFARI_colour_hue(r=c(SFARI_colors,7))) + theme_minimal() +
# labs(color = 'SFARI Score', shape = 'SFARI Score') + guides(alpha = FALSE)
# if(module != top_modules[length(top_modules)]) {p = p + theme(legend.position = 'none')}
return(p)
}
create_plot(top_modules[1])
create_plot(top_modules[2])
create_plot(top_modules[3])
rm(create_plot)
Ordered by \(\frac{MM+|GS|}{2}\)
There aren’t that many SFARI genes in the top genes of the modules
create_table = function(module){
top_genes = genes_info %>% dplyr::select(ID, hgnc_symbol, paste0('MM.',gsub('#','',module)), GS, gene.score) %>%
filter(genes_info$Module==module) %>% dplyr::rename('MM' = paste0('MM.', gsub('#','',module))) %>%
mutate(Relevance = (MM+abs(GS))/2,
gene.score = ifelse(gene.score == 'Others', 'Not in SFARI', as.character(gene.score))) %>%
arrange(by=-Relevance) %>% top_n(10) %>%
dplyr::rename('Gene Symbol' = hgnc_symbol, 'SFARI Score' = gene.score)
return(top_genes)
}
top_genes = list()
for(i in 1:length(top_modules)) top_genes[[i]] = create_table(top_modules[i])
kable(top_genes[[1]] %>% dplyr::select(-ID),
caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[1]][1],
' (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[1]][1],2),')')) %>%
kable_styling(full_width = F)
| Gene Symbol | MM | GS | SFARI Score | Relevance |
|---|---|---|---|---|
| MAPK9 | 0.8968422 | -0.9392328 | Neuronal | 0.9180375 |
| TRIM37 | 0.8936259 | -0.9293330 | Not in SFARI | 0.9114795 |
| PREPL | 0.8907183 | -0.9035408 | Not in SFARI | 0.8971295 |
| NAP1L5 | 0.8751900 | -0.9036163 | Not in SFARI | 0.8894031 |
| TTBK2 | 0.9009532 | -0.8742880 | Not in SFARI | 0.8876206 |
| EIF5A2 | 0.8516030 | -0.9108883 | Not in SFARI | 0.8812457 |
| PRKCE | 0.8446185 | -0.9076184 | Neuronal | 0.8761185 |
| DIRAS1 | 0.8525683 | -0.8958339 | Not in SFARI | 0.8742011 |
| ATP6V1C1 | 0.8400817 | -0.8957404 | Not in SFARI | 0.8679110 |
| NAPB | 0.9029772 | -0.8290573 | Not in SFARI | 0.8660173 |
kable(top_genes[[2]] %>% dplyr::select(-ID),
caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[2]][1],
' (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[2]][1],2),')')) %>%
kable_styling(full_width = F)
| Gene Symbol | MM | GS | SFARI Score | Relevance |
|---|---|---|---|---|
| FN3K | 0.8030437 | -0.8000068 | Not in SFARI | 0.8015253 |
| KIF1A | 0.8342445 | -0.7543228 | Neuronal | 0.7942837 |
| CELSR2 | 0.8551589 | -0.7116076 | Neuronal | 0.7833833 |
| STX1B | 0.7924147 | -0.7626449 | Neuronal | 0.7775298 |
| EPB41L1 | 0.8101531 | -0.7304887 | Not in SFARI | 0.7703209 |
| PRKACA | 0.8795831 | -0.6594748 | Neuronal | 0.7695290 |
| SYN1 | 0.8678907 | -0.6406826 | 3 | 0.7542866 |
| CAMTA2 | 0.8069597 | -0.6980794 | Not in SFARI | 0.7525196 |
| WNK2 | 0.7500853 | -0.7517314 | Not in SFARI | 0.7509084 |
| MARCH6 | 0.7877729 | -0.6990101 | Not in SFARI | 0.7433915 |
kable(top_genes[[3]] %>% dplyr::select(-ID),
caption=paste0('Top 10 genes for Module ', genes_info$module_number[genes_info$Module==top_modules[3]][1],
' (MTcor = ', round(genes_info$MTcor[genes_info$Module == top_modules[3]][1],2),')')) %>%
kable_styling(full_width = F)
| Gene Symbol | MM | GS | SFARI Score | Relevance |
|---|---|---|---|---|
| SPRY2 | 0.7566443 | 0.5560984 | Neuronal | 0.6563714 |
| KIAA1549 | 0.7517636 | 0.5504036 | Not in SFARI | 0.6510836 |
| TET3 | 0.8120406 | 0.4852710 | Not in SFARI | 0.6486558 |
| DNAJB5 | 0.7399456 | 0.5226353 | Not in SFARI | 0.6312905 |
| DUSP6 | 0.8448647 | 0.4085318 | Neuronal | 0.6266982 |
| GAREM | 0.6976841 | 0.5472754 | Not in SFARI | 0.6224798 |
| TRIM9 | 0.7593863 | 0.4835355 | Not in SFARI | 0.6214609 |
| EGR2 | 0.7291357 | 0.4856175 | Neuronal | 0.6073766 |
| TRIB1 | 0.6993757 | 0.5113890 | Not in SFARI | 0.6053823 |
| KDM5B | 0.6164204 | 0.5860347 | 2 | 0.6012275 |
rm(create_table, i)
Top genes don’t have PC2 values as high as they did with the top genes by cluster-diagnosis correlation
pca = datExpr %>% prcomp
ids = c()
for(tg in top_genes) ids = c(ids, tg$ID)
plot_data = data.frame('ID'=rownames(datExpr), 'gene_name' = genes_info$hgnc_symbol,
'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
left_join(genes_info, by='ID') %>% dplyr::select(ID, PC1, PC2,gene_name, Module, gene.score) %>%
mutate(color = ifelse(Module %in% top_modules, as.character(Module), '#cccccc')) %>%
mutate(alpha = ifelse(color %in% top_modules & ID %in% ids, 1,
ifelse(color %in% top_modules, 0.25, 0.1)))
plot_data %>% ggplot(aes(PC1, PC2)) + geom_point(alpha=plot_data$alpha, color=plot_data$color) +
geom_text(aes(label=ifelse(ID %in% ids,as.character(gene_name),'')),
color = plot_data$color, size = 2.5, hjust = 0, vjust = 0) +
xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],2),'%)')) +
ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],2),'%)')) +
theme_minimal() + ggtitle('Most relevant genes for top clusters')
rm(pca, tg, plot_data)
30/30 genes are differentially expressed
2/30 genes are differentially expressed (SYN1, KDM5B)
Level of expression by Diagnosis for top genes, ordered by relevance (as defined above): There is a visible difference in level of expression between diagnosis groups in all of these genes, but it’s not as strong as the differences in the top cluster by cluster-diagnosis correlation
create_plot = function(i){
plot_data = datExpr[rownames(datExpr) %in% top_genes[[i]]$ID,] %>% mutate('ID' = rownames(.)) %>%
melt(id.vars='ID') %>% mutate(variable = gsub('X','',variable)) %>%
left_join(top_genes[[i]], by='ID') %>%
left_join(datMeta %>% dplyr::select(Dissected_Sample_ID, Diagnosis),
by = c('variable'='Dissected_Sample_ID')) %>% arrange(desc(Relevance))
p = ggplotly(plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`,
levels=unique(plot_data$`Gene Symbol`), ordered=T)) %>%
ggplot(aes(external_gene_id, value, fill=Diagnosis)) + geom_boxplot() + theme_minimal() +
ggtitle(paste0('Top genes for cluster ',
genes_info$module_number[genes_info$Module==top_modules[i]][1],
' (MTcor = ',
round(genes_info$MTcor[genes_info$Module == top_modules[i]][1],2), ')')) +
xlab('') + ylab('Level of Expression') +
theme(axis.text.x = element_text(angle = 90, hjust = 1)))
# # For the thesis
# p = plot_data %>% mutate(external_gene_id=factor(`Gene Symbol`,
# levels=rev(unique(plot_data$`Gene Symbol`)), ordered=T)) %>%
# ggplot(aes(external_gene_id, value, fill=Diagnosis)) +
# geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) +
# xlab('') + ylab('Level of expression') + theme_minimal() + coord_flip()
# if(i < length(top_modules)) {p = p + theme(legend.position = 'none')}
return(p)
}
create_plot(1)
create_plot(2)
create_plot(3)
rm(create_plot)
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] kableExtra_1.1.0 knitr_1.32 org.Hs.eg.db_3.8.2
## [4] AnnotationDbi_1.46.1 IRanges_2.18.3 S4Vectors_0.22.1
## [7] Biobase_2.44.0 BiocGenerics_0.30.0 DOSE_3.10.2
## [10] ReactomePA_1.28.0 clusterProfiler_3.12.0 biomaRt_2.40.5
## [13] polycor_0.7-10 expss_0.10.2 WGCNA_1.69
## [16] fastcluster_1.1.25 dynamicTreeCut_1.63-1 ggExtra_0.9
## [19] ggpubr_0.2.5 magrittr_2.0.1 GGally_1.5.0
## [22] colorspace_2.0-2 gridExtra_2.3 viridis_0.5.1
## [25] viridisLite_0.4.0 RColorBrewer_1.1-2 dendextend_1.13.4
## [28] plotly_4.9.2 glue_1.4.2 reshape2_1.4.4
## [31] forcats_0.5.0 stringr_1.4.0 dplyr_1.0.1
## [34] purrr_0.3.4 readr_1.3.1 tidyr_1.1.0
## [37] tibble_3.1.2 ggplot2_3.3.5 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.1 tidyselect_1.1.0
## [3] RSQLite_2.2.0 htmlwidgets_1.5.1
## [5] grid_3.6.3 BiocParallel_1.18.1
## [7] munsell_0.5.0 codetools_0.2-16
## [9] preprocessCore_1.46.0 miniUI_0.1.1.1
## [11] withr_2.4.2 GOSemSim_2.10.0
## [13] highr_0.8 rstudioapi_0.11
## [15] ggsignif_0.6.0 labeling_0.4.2
## [17] urltools_1.7.3 GenomeInfoDbData_1.2.1
## [19] polyclip_1.10-0 bit64_4.0.5
## [21] farver_2.1.0 vctrs_0.3.8
## [23] generics_0.0.2 xfun_0.22
## [25] GenomeInfoDb_1.20.0 R6_2.5.0
## [27] doParallel_1.0.15 graphlayouts_0.7.0
## [29] locfit_1.5-9.4 DelayedArray_0.10.0
## [31] bitops_1.0-6 reshape_0.8.8
## [33] fgsea_1.10.1 gridGraphics_0.5-0
## [35] assertthat_0.2.1 promises_1.2.0.1
## [37] scales_1.1.1 ggraph_2.0.3
## [39] nnet_7.3-14 enrichplot_1.4.0
## [41] gtable_0.3.0 tidygraph_1.2.0
## [43] rlang_0.4.11 genefilter_1.66.0
## [45] splines_3.6.3 lazyeval_0.2.2
## [47] acepack_1.4.1 impute_1.58.0
## [49] broom_0.7.0 europepmc_0.4
## [51] checkmate_2.0.0 BiocManager_1.30.10
## [53] yaml_2.2.1 modelr_0.1.6
## [55] crosstalk_1.1.0.1 backports_1.1.8
## [57] httpuv_1.6.1 qvalue_2.16.0
## [59] Hmisc_4.4-0 tools_3.6.3
## [61] ggplotify_0.0.5 ellipsis_0.3.2
## [63] jquerylib_0.1.4 ggridges_0.5.2
## [65] Rcpp_1.0.6 plyr_1.8.6
## [67] zlibbioc_1.30.0 base64enc_0.1-3
## [69] progress_1.2.2 RCurl_1.98-1.2
## [71] prettyunits_1.1.1 rpart_4.1-15
## [73] cowplot_1.1.1 SummarizedExperiment_1.14.1
## [75] haven_2.2.0 ggrepel_0.8.2
## [77] cluster_2.1.0 fs_1.5.0
## [79] data.table_1.12.8 DO.db_2.9
## [81] triebeard_0.3.0 reprex_0.3.0
## [83] reactome.db_1.68.0 matrixStats_0.56.0
## [85] hms_0.5.3 mime_0.11
## [87] evaluate_0.14 xtable_1.8-4
## [89] XML_3.99-0.3 jpeg_0.1-8.1
## [91] readxl_1.3.1 compiler_3.6.3
## [93] crayon_1.4.1 htmltools_0.5.1.1
## [95] later_1.2.0 Formula_1.2-3
## [97] geneplotter_1.62.0 lubridate_1.7.4
## [99] DBI_1.1.0 tweenr_1.0.1
## [101] dbplyr_1.4.2 MASS_7.3-53
## [103] rappdirs_0.3.3 Matrix_1.2-18
## [105] cli_2.5.0 igraph_1.2.5
## [107] GenomicRanges_1.36.1 pkgconfig_2.0.3
## [109] rvcheck_0.1.8 foreign_0.8-76
## [111] xml2_1.2.5 foreach_1.5.0
## [113] annotate_1.62.0 bslib_0.2.5.1
## [115] XVector_0.24.0 webshot_0.5.2
## [117] rvest_0.3.5 digest_0.6.27
## [119] graph_1.62.0 rmarkdown_2.7
## [121] cellranger_1.1.0 fastmatch_1.1-0
## [123] htmlTable_1.13.3 curl_4.3
## [125] shiny_1.6.0 graphite_1.30.0
## [127] lifecycle_1.0.0 jsonlite_1.7.2
## [129] fansi_0.5.0 pillar_1.6.1
## [131] lattice_0.20-41 fastmap_1.1.0
## [133] httr_1.4.2 survival_3.2-7
## [135] GO.db_3.8.2 UpSetR_1.4.0
## [137] png_0.1-7 iterators_1.0.12
## [139] bit_4.0.4 ggforce_0.3.1
## [141] stringi_1.5.3 sass_0.4.0
## [143] blob_1.2.1 DESeq2_1.24.0
## [145] latticeExtra_0.6-29 memoise_1.1.0